In [1]:
#!pip install praw
Collecting praw
  Downloading praw-7.6.1-py3-none-any.whl (188 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 188.8/188.8 KB 2.0 MB/s eta 0:00:0000:0100:01
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Requirement already satisfied: websocket-client>=0.54.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from praw) (1.4.2)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Requirement already satisfied: requests<3.0,>=2.6.0 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from prawcore<3,>=2.1->praw) (2.28.1)
Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2022.12.7)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (1.26.13)
Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (3.4)
Requirement already satisfied: charset-normalizer<3,>=2 in /opt/homebrew/Caskroom/miniforge/base/envs/virt/lib/python3.8/site-packages (from requests<3.0,>=2.6.0->prawcore<3,>=2.1->praw) (2.1.1)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.6.1 prawcore-2.3.0 update-checker-0.18.0
In [216]:
import praw
import pandas as pd
 
reddit_read_only = praw.Reddit(client_id="-FkFx07VGHhRLGZW9CNuRw",         # your client id
                               client_secret="LP6ZvPq-t4OmaNm8xtIjkxHLRC7N0A",      # your client secret
                               user_agent="MK scraper")        # your user agent
 
 
subrdit = reddit_read_only.subreddit("AmITheAsshole")
 
# Display the name of the Subreddit
#print("Display Name:", subrdit.display_name)
 
# Display the description of the Subreddit
#print("Description:", subrdit.description)
In [217]:
subreddit = reddit_read_only.subreddit("AmITheAsshole")
 
for post in subreddit.top(limit=5):
    print(post.title)
    print()
AITA for telling my wife the lock on my daughter's door does not get removed til my brother inlaw and his daughters are out of our house?

META: This sub is moving towards a value system that frequently doesn't align with the rest of the world

UPDATE, AITA for despising my mentally handicap sister?

AITA For suing my girlfriend after she had my 1967 impala project taken to the scrapyard?

AITA for bringing my SIL’s wallet to the restaurant when she conveniently always forgets it?

In [218]:
posts = subreddit.top("year", limit = 800)

posts_dict = {'title' : [], 'body': [], 'score': [], 'id': [], 'top_comment_body' : [], 'top_comment_score': [], 'url': []}

i=0
for post in posts:
    # Title of each post
    posts_dict["title"].append(post.title)
     
    # Text inside a post
    posts_dict["body"].append(post.selftext)
     
    # Unique ID of each post
    posts_dict["id"].append(post.id)
     
    # The score of a post
    posts_dict["score"].append(post.score)
    
    # Text inside the top comment of the post
    posts_dict["top_comment_body"].append(post.comments[1].body)
    
    # Score of the top comment of the post
    posts_dict["top_comment_score"].append(post.comments[1].score)
     
    # URL of each post
    posts_dict["url"].append(post.url)
    
    if i%10 == 0:
        print("Done with post number ", i)
    i += 1
    
# Saving the data in a pandas dataframe
top_posts = pd.DataFrame(posts_dict)
top_posts
Done with post number  0
Done with post number  10
Done with post number  20
Done with post number  30
Done with post number  40
Done with post number  50
Done with post number  60
Done with post number  70
Done with post number  80
Done with post number  90
Done with post number  100
Done with post number  110
Done with post number  120
Done with post number  130
Done with post number  140
Done with post number  150
Done with post number  160
Done with post number  170
Done with post number  180
Done with post number  190
Done with post number  200
Done with post number  210
Done with post number  220
Done with post number  230
Done with post number  240
Done with post number  250
Done with post number  260
Done with post number  270
Done with post number  280
Done with post number  290
Done with post number  300
Done with post number  310
Done with post number  320
Done with post number  330
Done with post number  340
Done with post number  350
Done with post number  360
Done with post number  370
Done with post number  380
Done with post number  390
Done with post number  400
Done with post number  410
Done with post number  420
Done with post number  430
Done with post number  440
Done with post number  450
Done with post number  460
Done with post number  470
Done with post number  480
Done with post number  490
Done with post number  500
Done with post number  510
Done with post number  520
Done with post number  530
Done with post number  540
Done with post number  550
Done with post number  560
Done with post number  570
Done with post number  580
Done with post number  590
Done with post number  600
Done with post number  610
Done with post number  620
Done with post number  630
Done with post number  640
Done with post number  650
Done with post number  660
Done with post number  670
Done with post number  680
Done with post number  690
Done with post number  700
Done with post number  710
Done with post number  720
Done with post number  730
Done with post number  740
Done with post number  750
Done with post number  760
Done with post number  770
Done with post number  780
Done with post number  790
Out[218]:
title body score id top_comment_body top_comment_score url
0 AITA for bringing my SIL’s wallet to the resta... Edit: update on profile\n\nMy (f28) SIL “Amy” ... 68512 x2k5kv NTA. Stone cold busted. Next time she books an... 1442 https://www.reddit.com/r/AmItheAsshole/comment...
1 AITA for bringing up my brother's "premature" ... I am a nurse practitioner and I am the primary... 56259 zvmflw You can tell the family about the time you wer... 678 https://www.reddit.com/r/AmItheAsshole/comment...
2 AITA for not taking down my video that was a g... I have a sister that’s 6 years older than me. ... 54743 wyjbjs NTA\n\nMy parents missed my wedding too all be... 1578 https://www.reddit.com/r/AmItheAsshole/comment...
3 UPDATE AITA for walking out of the Airport whe... Hello!.\n\n\nI don't know where to begin...it'... 51464 ur2l3s I'm sorry you are going through this, but I'm ... 18671 https://www.reddit.com/r/AmItheAsshole/comment...
4 AITA for walking out of the Airport when I saw... \n\nI F30 don't have the best relationship wit... 50024 unhse2 Definitely NTA. You know that if you had sucke... 9416 https://www.reddit.com/r/AmItheAsshole/comment...
... ... ... ... ... ... ... ...
795 AITA for saying I will never host another fami... We had a family get together for Father’s Day.... 14260 vge8ha NTA but a 45 year old man licked 6 slices of p... 12475 https://www.reddit.com/r/AmItheAsshole/comment...
796 AITA for bothering a woman at home? My son (4) had a sleepover last night with a f... 14266 10aeoi3 YTA.\n\nText - no response\n\nCall - no respon... 3533 https://www.reddit.com/r/AmItheAsshole/comment...
797 AITA for having a "scary" kitty around children? I (m27) have a cat named stargazer. I adopted... 14251 tiq83r Guess they just lost their free babysitter the... 5976 https://www.reddit.com/r/AmItheAsshole/comment...
798 AITA for sticking to my(29M) guns when it came... When my wife(31F) and I(29M) were trying to ge... 14227 uvoj8s “Out of left field and creepy” is exactly righ... 1 https://www.reddit.com/r/AmItheAsshole/comment...
799 AITA for not telling my fiancé where I go on S... So super weird situation and need a judgement.... 14219 sexfa3 Girl wants you back. Who would bring a charger... 82 https://www.reddit.com/r/AmItheAsshole/comment...

800 rows × 7 columns

In [219]:
top_posts.to_csv('TM_project/reddit_posts.csv', index=False)
In [220]:
import pickle # for loading (and saving) the previously web scraped data

import pandas as pd # for processing data in dataframes
import matplotlib.pyplot as plt # for plotting

import re # for cleaning textual data (uses regular expressions ouch!)
from collections import Counter # for counting tokens occurences
import math # for calculations

import nltk
from nltk.tokenize import word_tokenize # for tokenization
from nltk.stem import PorterStemmer # for stemming
from nltk.corpus import stopwords

# import stop_words # source: https://pypi.org/project/stop-words/#installation
# from stop_words import get_stop_words # alternative stopwords list

import gensim
from gensim import corpora # for: Dictionary(), word2bow()
from gensim import models # for: TfidfModel()

import statistics # for: quantiles()

import numpy as np # for some maths

import time # for measuring time of computation

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
In [221]:
#removing \n
post_texts = top_posts["body"].map(lambda x: re.sub('\n', ' ', x))

#removing all numbers
post_texts = post_texts.map(lambda x: re.sub(r'[0-9]+', '', x))

#removing ,\!?/:;''()``’“-”—#
post_texts = post_texts.map(lambda x: re.sub("[,\!?/:;''()``’“-”—#]", '', x))

#removing .
post_texts = post_texts.map(lambda x: re.sub(r"([.]+)", '', x))

#all letters to lover case
post_texts = post_texts.map(lambda x: x.lower())

#removing one-letter words
post_texts = post_texts.map(lambda x: re.sub(r'\b\w\b', '', x))

post_texts
Out[221]:
0      edit update on profile  my  sil amy  always co...
1       am  nurse practitioner and  am the primary ca...
2       have  sister thats  years older than me   my ...
3      hello    dont know where to beginits been an a...
4          dont have the best relationship with my hu...
                             ...                        
795    we had  family get together for fathers day my...
796    my son  had  sleepover last night with  friend...
797      have   cat named stargazer  adopted her arou...
798    when my wifef and im were trying to get pregna...
799    so super weird situation and need  judgement  ...
Name: body, Length: 800, dtype: object
In [222]:
#word_tokenize() applied to every single text
for i in range(0,len(post_texts)):
    post_texts[i] = word_tokenize(post_texts[i])
    
post_texts
Out[222]:
0      [edit, update, on, profile, my, sil, amy, alwa...
1      [am, nurse, practitioner, and, am, the, primar...
2      [have, sister, thats, years, older, than, me, ...
3      [hello, dont, know, where, to, beginits, been,...
4      [dont, have, the, best, relationship, with, my...
                             ...                        
795    [we, had, family, get, together, for, fathers,...
796    [my, son, had, sleepover, last, night, with, f...
797    [have, cat, named, stargazer, adopted, her, ar...
798    [when, my, wifef, and, im, were, trying, to, g...
799    [so, super, weird, situation, and, need, judge...
Name: body, Length: 800, dtype: object
In [223]:
ps = PorterStemmer()

for i in range(0,len(post_texts)):
    
    words = []
    for word in post_texts[i]:
        words.append(ps.stem(word)) #stems every token in document and append it to a list
                                    #it takes few minutes

post_texts
Out[223]:
0      [edit, update, on, profile, my, sil, amy, alwa...
1      [am, nurse, practitioner, and, am, the, primar...
2      [have, sister, thats, years, older, than, me, ...
3      [hello, dont, know, where, to, beginits, been,...
4      [dont, have, the, best, relationship, with, my...
                             ...                        
795    [we, had, family, get, together, for, fathers,...
796    [my, son, had, sleepover, last, night, with, f...
797    [have, cat, named, stargazer, adopted, her, ar...
798    [when, my, wifef, and, im, were, trying, to, g...
799    [so, super, weird, situation, and, need, judge...
Name: body, Length: 800, dtype: object
In [224]:
stop_words = nltk.corpus.stopwords.words('english') #one of stopwords dictionaries available in Python

# cleaning stopwords
stop_words = pd.Series(stop_words).map(lambda x: re.sub('\n', '', x))
stop_words = stop_words.map(lambda x: re.sub("[,\!?/:;''()``]", '', x))
stop_words = stop_words.map(lambda x: re.sub(r"([.]+)", '', x))

# stemming stopwords
ps = PorterStemmer()
for i in range(0,len(stop_words)):
    stop_words[i] = ps.stem(stop_words[i])
In [225]:
#making stopwords back a list
stop_words = list(stop_words)

#adding some specific stopwords
stop_words.append('``')
stop_words.append("\'\'")
In [226]:
# removing stopwords from post texts
for i in range(0,len(post_texts)):
    post_texts[i] = [word for word in post_texts[i] if not word in list(stop_words)]
post_texts
Out[226]:
0      [edit, update, profile, sil, amy, always, come...
1      [nurse, practitioner, primary, care, provider,...
2      [sister, thats, years, older, parents, years, ...
3      [hello, know, beginits, absolute, nightmare, r...
4      [best, relationship, husbands, mom, since, day...
                             ...                        
795    [family, get, together, fathers, day, older, b...
796    [son, sleepover, last, night, friend, this, fr...
797    [cat, named, stargazer, adopted, around, years...
798    [wifef, im, trying, get, pregnant, deal, boy, ...
799    [super, weird, situation, need, judgement, eng...
Name: body, Length: 800, dtype: object
In [227]:
top_posts["body_clean"] = post_texts
top_posts.head()
Out[227]:
title body score id top_comment_body top_comment_score url body_clean
0 AITA for bringing my SIL’s wallet to the resta... Edit: update on profile\n\nMy (f28) SIL “Amy” ... 68512 x2k5kv NTA. Stone cold busted. Next time she books an... 1442 https://www.reddit.com/r/AmItheAsshole/comment... [edit, update, profile, sil, amy, always, come...
1 AITA for bringing up my brother's "premature" ... I am a nurse practitioner and I am the primary... 56259 zvmflw You can tell the family about the time you wer... 678 https://www.reddit.com/r/AmItheAsshole/comment... [nurse, practitioner, primary, care, provider,...
2 AITA for not taking down my video that was a g... I have a sister that’s 6 years older than me. ... 54743 wyjbjs NTA\n\nMy parents missed my wedding too all be... 1578 https://www.reddit.com/r/AmItheAsshole/comment... [sister, thats, years, older, parents, years, ...
3 UPDATE AITA for walking out of the Airport whe... Hello!.\n\n\nI don't know where to begin...it'... 51464 ur2l3s I'm sorry you are going through this, but I'm ... 18671 https://www.reddit.com/r/AmItheAsshole/comment... [hello, know, beginits, absolute, nightmare, r...
4 AITA for walking out of the Airport when I saw... \n\nI F30 don't have the best relationship wit... 50024 unhse2 Definitely NTA. You know that if you had sucke... 9416 https://www.reddit.com/r/AmItheAsshole/comment... [best, relationship, husbands, mom, since, day...
In [228]:
def generate_ngrams(text, ngram = 1):
    temp = zip(*[text[i:] for i in range(0,ngram)]) # set with pairs, three, ..., ns of tokens
    ans = [' '.join(ngram) for ngram in temp] # joins the elements in strings
    ans = pd.Series(ans).map(lambda x: re.sub(" ", '_', x)) # replaces spaces with '_'
    return list(ans)
In [229]:
for i in range(0,len(post_texts)):
    unigrams = post_texts[i]
    bigrams = generate_ngrams(post_texts[i], ngram = 2)
    trigrams = generate_ngrams(post_texts[i], ngram = 3)
    
    text = []
    text.append(unigrams)
    text.append(bigrams)
    text.append(trigrams)
    
    post_texts[i] = [item for sublist in text for item in sublist] 

post_texts
Out[229]:
0      [edit, update, profile, sil, amy, always, come...
1      [nurse, practitioner, primary, care, provider,...
2      [sister, thats, years, older, parents, years, ...
3      [hello, know, beginits, absolute, nightmare, r...
4      [best, relationship, husbands, mom, since, day...
                             ...                        
795    [family, get, together, fathers, day, older, b...
796    [son, sleepover, last, night, friend, this, fr...
797    [cat, named, stargazer, adopted, around, years...
798    [wifef, im, trying, get, pregnant, deal, boy, ...
799    [super, weird, situation, need, judgement, eng...
Name: body, Length: 800, dtype: object
In [230]:
#dictionary from gensim library = keys are: 1, 2, 3, ..., number of tokens; values are tokens' names
dictionary = corpora.Dictionary(post_texts) 

#corpus from gensim library consists of so called bows
#every bow = keys are tokens' indexes; values are numbers of tokens' occurences in text
corpus = [dictionary.doc2bow(text) for text in post_texts]
In [231]:
tfidf_model = models.TfidfModel(corpus, id2word = dictionary)
In [232]:
def TFIDF(dictionary, corpus, which_text, tfidf_model):
    bow = corpus[which_text]
    tfidfdictionary = dict(tfidf_model[bow]) #TFIDF for tokens in a chosen text
    
    #below: keys are tokens' names; values are numbers of tokens' occurences in text
    TFIDFdictionary = dict((dictionary[key], value) for (key, value) in tfidfdictionary.items())
    
    return(TFIDFdictionary)
In [233]:
TFIDF(dictionary, corpus, 0, tfidf_model)
Out[233]:
{'*': 0.02909107242391753,
 '*_make': 0.05215345646722317,
 '*_make_fair': 0.05215345646722317,
 '*_specifically': 0.05215345646722317,
 '*_specifically_*': 0.05215345646722317,
 'admit': 0.022992093374259996,
 'admit_got': 0.04674551016017217,
 'admit_got_this': 0.05215345646722317,
 'aita': 0.004934951957707041,
 'aita_taking': 0.04674551016017217,
 'aita_taking_wallet': 0.05215345646722317,
 'always': 0.027602293438124433,
 'always_comes': 0.04674551016017217,
 'always_comes_visit': 0.05215345646722317,
 'always_conveniently': 0.05215345646722317,
 'always_conveniently_forgets': 0.05215345646722317,
 'always_wants': 0.041337563853121165,
 'always_wants_go': 0.05215345646722317,
 'amount': 0.024414569317916215,
 'amount_money': 0.039596594001239084,
 'amount_money_much': 0.05215345646722317,
 'amy': 0.11452235417274861,
 'amy_always': 0.05215345646722317,
 'amy_always_comes': 0.05215345646722317,
 'amy_called': 0.05215345646722317,
 'amy_called_saw': 0.05215345646722317,
 'amy_hopefully': 0.05215345646722317,
 'amy_hopefully_reading': 0.05215345646722317,
 'asked': 0.006721951543800229,
 'asked_pay': 0.039596594001239084,
 'asked_pay_back': 0.05215345646722317,
 'asked_separate': 0.05215345646722317,
 'asked_separate_bills': 0.05215345646722317,
 'asshole': 0.014176868386002118,
 'asshole_ill': 0.05215345646722317,
 'asshole_ill_admit': 0.05215345646722317,
 'awards': 0.034188647694188085,
 'awards_jeez': 0.05215345646722317,
 'awards_jeez_lol': 0.05215345646722317,
 'back': 0.011012172367351048,
 'back_inside': 0.034188647694188085,
 'back_inside_found': 0.05215345646722317,
 'back_never': 0.05215345646722317,
 'back_never_has': 0.05215345646722317,
 'badmouthing': 0.04674551016017217,
 'badmouthing_internet': 0.05215345646722317,
 'badmouthing_internet_honestly': 0.05215345646722317,
 'because': 0.002270550677011268,
 'because_forgot': 0.05215345646722317,
 'because_forgot_wallet': 0.05215345646722317,
 'before': 0.00719783178824592,
 'before_left': 0.030048676889675205,
 'before_left_made': 0.05215345646722317,
 'bill': 0.0946904564305479,
 'bill_asked': 0.05215345646722317,
 'bill_asked_pay': 0.05215345646722317,
 'bill_because': 0.04674551016017217,
 'bill_because_forgot': 0.05215345646722317,
 'bill_this': 0.04674551016017217,
 'bill_this_might': 0.05215345646722317,
 'bills': 0.025881756084899062,
 'bills_said': 0.05215345646722317,
 'bills_said_need': 0.05215345646722317,
 'bringing': 0.022808508025950944,
 'bringing_restaurant': 0.05215345646722317,
 'bringing_restaurant_edit': 0.05215345646722317,
 'call': 0.01150895528187117,
 'called': 0.006607382417254549,
 'called_saw': 0.05215345646722317,
 'called_saw_this': 0.05215345646722317,
 'cant': 0.018592991908553364,
 'cant_keep': 0.0369714317839003,
 'cant_keep_thank': 0.05215345646722317,
 'cant_pay': 0.05215345646722317,
 'cant_pay_share': 0.05215345646722317,
 'car': 0.014801362977496785,
 'car_pretended': 0.05215345646722317,
 'car_pretended_forgot': 0.05215345646722317,
 'care': 0.013767680076847257,
 'care_amy': 0.05215345646722317,
 'care_amy_hopefully': 0.05215345646722317,
 'clear': 0.01911888418965894,
 'clear_paying': 0.05215345646722317,
 'clear_paying_bill': 0.05215345646722317,
 'come': 0.009043776943138872,
 'come_town': 0.05215345646722317,
 'come_town_nonetheless': 0.05215345646722317,
 'comes': 0.01786788805686598,
 'comes_visit': 0.05215345646722317,
 'comes_visit_town': 0.05215345646722317,
 'comments': 0.02934480215481893,
 'comments_cant': 0.04674551016017217,
 'comments_cant_keep': 0.05215345646722317,
 'comments_wake': 0.05215345646722317,
 'comments_wake_call': 0.05215345646722317,
 'conveniently': 0.05215345646722317,
 'conveniently_forgets': 0.05215345646722317,
 'conveniently_forgets_wallet': 0.05215345646722317,
 'domes': 0.05215345646722317,
 'domes_excuses': 0.05215345646722317,
 'domes_excuses_why': 0.05215345646722317,
 'done': 0.012556862465984078,
 'done_eating': 0.05215345646722317,
 'done_eating_asked': 0.05215345646722317,
 'eating': 0.020473809777848063,
 'eating_asked': 0.04674551016017217,
 'eating_asked_separate': 0.05215345646722317,
 'edit': 0.03465511850598997,
 'edit_amy': 0.05215345646722317,
 'edit_amy_called': 0.05215345646722317,
 'edit_update': 0.0369714317839003,
 'edit_update_profile': 0.05215345646722317,
 'edit_wow': 0.043582064364633875,
 'edit_wow_thanks': 0.05215345646722317,
 'episode': 0.04674551016017217,
 'episode_two': 0.05215345646722317,
 'episode_two_half': 0.05215345646722317,
 'every': 0.01176904661013826,
 'every_time': 0.024873643918128827,
 'every_time_come': 0.05215345646722317,
 'everyone': 0.010661392172983898,
 'everyone_something': 0.04674551016017217,
 'everyone_something_say': 0.05215345646722317,
 'excuses': 0.029180892558275585,
 'excuses_why': 0.043582064364633875,
 'excuses_why_cant': 0.05215345646722317,
 'expensive': 0.04422907639921285,
 'expensive_restaurant': 0.05215345646722317,
 'expensive_restaurant_last': 0.05215345646722317,
 'expensive_restaurants': 0.05215345646722317,
 'expensive_restaurants_always': 0.05215345646722317,
 'extremely': 0.04235434039577244,
 'extremely_expensive': 0.05215345646722317,
 'extremely_expensive_restaurant': 0.05215345646722317,
 'extremely_furious': 0.05215345646722317,
 'extremely_furious_said': 0.05215345646722317,
 'fair': 0.022282329985197233,
 'fair_amount': 0.04674551016017217,
 'fair_amount_money': 0.05215345646722317,
 'forgets': 0.04674551016017217,
 'forgets_wallet': 0.05215345646722317,
 'forgets_wallet_domes': 0.05215345646722317,
 'forgot': 0.05471645088696174,
 'forgot_something': 0.05215345646722317,
 'forgot_something_went': 0.05215345646722317,
 'forgot_wallet': 0.04674551016017217,
 'forgot_wallet_reached': 0.05215345646722317,
 'found': 0.010973514486475417,
 'found_wallet': 0.05215345646722317,
 'found_wallet_sitting': 0.05215345646722317,
 'furious': 0.021325784544935195,
 'furious_said': 0.039596594001239084,
 'furious_said_touched': 0.05215345646722317,
 'go': 0.004990220316228659,
 'go_expensive': 0.05215345646722317,
 'go_expensive_restaurants': 0.05215345646722317,
 'got': 0.002617364098430153,
 'got_this': 0.035929617546070165,
 'got_this_move': 0.05215345646722317,
 'grabbed': 0.025113724931968156,
 'grabbed_wallet': 0.05215345646722317,
 'grabbed_wallet_aita': 0.05215345646722317,
 'half': 0.019465697611077828,
 'half_men': 0.05215345646722317,
 'half_men_leaving': 0.05215345646722317,
 'has': 0.007178687603053892,
 'has_implied': 0.05215345646722317,
 'has_implied_since': 0.05215345646722317,
 'has_made': 0.0369714317839003,
 'has_made_reservation': 0.05215345646722317,
 'honestly': 0.018062948649659606,
 'honestly_care': 0.043582064364633875,
 'honestly_care_amy': 0.05215345646722317,
 'hopefully': 0.0315634854768493,
 'hopefully_reading': 0.05215345646722317,
 'hopefully_reading_comments': 0.05215345646722317,
 'hotel': 0.025617255591598793,
 'hotel_always': 0.05215345646722317,
 'hotel_always_wants': 0.05215345646722317,
 'husband': 0.014152225961455234,
 'husband_pay': 0.04674551016017217,
 'husband_pay_*': 0.05215345646722317,
 'husband_went': 0.0369714317839003,
 'husband_went_car': 0.05215345646722317,
 'ill': 0.01354314609701713,
 'ill_admit': 0.038174118057582876,
 'ill_admit_got': 0.05215345646722317,
 'implied': 0.034188647694188085,
 'implied_since': 0.05215345646722317,
 'implied_since_make': 0.05215345646722317,
 'inside': 0.01958481469305312,
 'inside_found': 0.05215345646722317,
 'inside_found_wallet': 0.05215345646722317,
 'instead': 0.013654605382874378,
 'instead_hotel': 0.043582064364633875,
 'instead_hotel_always': 0.05215345646722317,
 'internet': 0.0315634854768493,
 'internet_honestly': 0.05215345646722317,
 'internet_honestly_care': 0.05215345646722317,
 'jeez': 0.04674551016017217,
 'jeez_lol': 0.05215345646722317,
 'jeez_lol_many': 0.05215345646722317,
 'keep': 0.012316781452144751,
 'keep_thank': 0.05215345646722317,
 'keep_thank_everyone': 0.05215345646722317,
 'last': 0.008859468750773293,
 'last_night': 0.022808508025950944,
 'last_night_before': 0.05215345646722317,
 'leaving': 0.018062948649659606,
 'leaving_husband': 0.05215345646722317,
 'leaving_husband_went': 0.05215345646722317,
 'left': 0.0076524468185637075,
 'left_made': 0.04674551016017217,
 'left_made_clear': 0.05215345646722317,
 'lol': 0.025617255591598793,
 'lol_many': 0.05215345646722317,
 'lol_many_comments': 0.05215345646722317,
 'made': 0.013583018730921954,
 'made_clear': 0.02960272595499357,
 'made_clear_paying': 0.05215345646722317,
 'made_reservation': 0.04674551016017217,
 'made_reservation_extremely': 0.05215345646722317,
 'make': 0.014995892754891205,
 'make_fair': 0.05215345646722317,
 'make_fair_amount': 0.05215345646722317,
 'make_much': 0.038174118057582876,
 'make_much_money': 0.043582064364633875,
 'many': 0.014672401077409466,
 'many_comments': 0.038174118057582876,
 'many_comments_cant': 0.04674551016017217,
 'men': 0.025881756084899062,
 'men_leaving': 0.05215345646722317,
 'men_leaving_husband': 0.05215345646722317,
 'might': 0.016542332829378864,
 'might_asshole': 0.04674551016017217,
 'might_asshole_ill': 0.05215345646722317,
 'money': 0.023103412337326646,
 'money_much': 0.04674551016017217,
 'money_much_treat': 0.05215345646722317,
 'money_one': 0.05215345646722317,
 'money_one_pay': 0.05215345646722317,
 'move': 0.016542332829378864,
 'move_straight': 0.05215345646722317,
 'move_straight_episode': 0.05215345646722317,
 'much': 0.015252966685256148,
 'much_money': 0.033445036020718126,
 'much_money_one': 0.05215345646722317,
 'much_treat': 0.05215345646722317,
 'much_treat_someone': 0.05215345646722317,
 'need': 0.011637917181958491,
 'need_one': 0.04674551016017217,
 'need_one_bill': 0.05215345646722317,
 'never': 0.00786332498863238,
 'never_has': 0.05215345646722317,
 'never_has_made': 0.05215345646722317,
 'night': 0.011637917181958491,
 'night_before': 0.034188647694188085,
 'night_before_left': 0.05215345646722317,
 'nonetheless': 0.043582064364633875,
 'nonetheless_past': 0.05215345646722317,
 'nonetheless_past_paid': 0.05215345646722317,
 'one': 0.008499941713390111,
 'one_bill': 0.05215345646722317,
 'one_bill_because': 0.05215345646722317,
 'one_pay': 0.05215345646722317,
 'one_pay_husband': 0.05215345646722317,
 'paid': 0.018786833340891568,
 'paid_bill': 0.04674551016017217,
 'paid_bill_asked': 0.05215345646722317,
 'past': 0.01584317721532692,
 'past_paid': 0.05215345646722317,
 'past_paid_bill': 0.05215345646722317,
 'pay': 0.05062000937043441,
 'pay_*': 0.05215345646722317,
 'pay_*_specifically': 0.05215345646722317,
 'pay_back': 0.0321416771590372,
 'pay_back_never': 0.05215345646722317,
 'pay_husband': 0.05215345646722317,
 'pay_husband_pay': 0.05215345646722317,
 'pay_share': 0.05215345646722317,
 'pay_share_has': 0.05215345646722317,
 'paying': 0.01767758551716605,
 'paying_bill': 0.05215345646722317,
 'paying_bill_this': 0.05215345646722317,
 'post': 0.017491814423691122,
 'post_yelled': 0.05215345646722317,
 'post_yelled_badmouthing': 0.05215345646722317,
 'pretended': 0.035929617546070165,
 'pretended_forgot': 0.05215345646722317,
 'pretended_forgot_something': 0.05215345646722317,
 'profile': 0.03276617175053187,
 'profile_sil': 0.05215345646722317,
 'profile_sil_amy': 0.05215345646722317,
 'purse': 0.07002134452408915,
 'purse_said': 0.043582064364633875,
 'purse_said_this': 0.05215345646722317,
 'purse_went': 0.04674551016017217,
 'purse_went_restaurant': 0.05215345646722317,
 'put': 0.010435230908275916,
 'put_purse': 0.05215345646722317,
 'put_purse_went': 0.05215345646722317,
 'reached': 0.024194779647942567,
 'reached_purse': 0.05215345646722317,
 'reached_purse_said': 0.05215345646722317,
 'reading': 0.026733730851986198,
 'reading_comments': 0.035929617546070165,
 'reading_comments_wake': 0.05215345646722317,
 'reservation': 0.038174118057582876,
 'reservation_extremely': 0.05215345646722317,
 'reservation_extremely_expensive': 0.05215345646722317,
 'restaurant': 0.06585083740928958,
 'restaurant_done': 0.05215345646722317,
 'restaurant_done_eating': 0.05215345646722317,
 'restaurant_edit': 0.05215345646722317,
 'restaurant_edit_wow': 0.05215345646722317,
 'restaurant_last': 0.05215345646722317,
 'restaurant_last_night': 0.05215345646722317,
 'restaurants': 0.041337563853121165,
 'restaurants_always': 0.05215345646722317,
 'restaurants_always_conveniently': 0.05215345646722317,
 'right': 0.009491556547070306,
 'right_top': 0.04674551016017217,
 'right_top_suitcase': 0.05215345646722317,
 'said': 0.0036324528325895316,
 'said_need': 0.034188647694188085,
 'said_need_one': 0.05215345646722317,
 'said_this': 0.02163178522820401,
 'said_this_wallet': 0.05215345646722317,
 'said_touched': 0.05215345646722317,
 'said_touched_grabbed': 0.05215345646722317,
 'saw': 0.012176200760157992,
 'saw_this': 0.038174118057582876,
 'saw_this_post': 0.05215345646722317,
 'say': 0.008135199789966125,
 'say_edit': 0.04674551016017217,
 'say_edit_amy': 0.05215345646722317,
 'separate': 0.027690276292248236,
 'separate_bills': 0.05215345646722317,
 'separate_bills_said': 0.05215345646722317,
 'share': 0.02088817355473405,
 'share_has': 0.05215345646722317,
 'share_has_implied': 0.05215345646722317,
 'sil': 0.023772946251224582,
 'sil_amy': 0.05215345646722317,
 'sil_amy_always': 0.05215345646722317,
 'since': 0.004843699252915866,
 'since_make': 0.04674551016017217,
 'since_make_much': 0.05215345646722317,
 'sitting': 0.019348371831445058,
 'sitting_right': 0.05215345646722317,
 'sitting_right_top': 0.05215345646722317,
 'someone': 0.014420701271670697,
 'someone_every': 0.05215345646722317,
 'someone_every_time': 0.05215345646722317,
 'something': 0.018025509929239392,
 'something_say': 0.041337563853121165,
 'something_say_edit': 0.05215345646722317,
 'something_went': 0.04674551016017217,
 'something_went_back': 0.05215345646722317,
 'specifically': 0.030048676889675205,
 'specifically_*': 0.05215345646722317,
 'specifically_*_make': 0.05215345646722317,
 'stays': 0.03052167123901916,
 'stays_us': 0.04674551016017217,
 'stays_us_instead': 0.05215345646722317,
 'straight': 0.025113724931968156,
 'straight_episode': 0.05215345646722317,
 'straight_episode_two': 0.05215345646722317,
 'suitcase': 0.043582064364633875,
 'suitcase_put': 0.05215345646722317,
 'suitcase_put_purse': 0.05215345646722317,
 'taking': 0.014866652206352325,
 'taking_wallet': 0.05215345646722317,
 'taking_wallet_bringing': 0.05215345646722317,
 'thank': 0.01679003672347147,
 'thank_everyone': 0.025881756084899062,
 'thank_everyone_something': 0.05215345646722317,
 'thanks': 0.021789407100577425,
 'thanks_awards': 0.05215345646722317,
 'thanks_awards_jeez': 0.05215345646722317,
 'this': 0.004752284754299111,
 'this_might': 0.03276617175053187,
 'this_might_asshole': 0.05215345646722317,
 'this_move': 0.043582064364633875,
 'this_move_straight': 0.05215345646722317,
 'this_post': 0.02735822544348087,
 'this_post_yelled': 0.05215345646722317,
 'this_wallet': 0.05215345646722317,
 'this_wallet_was': 0.05215345646722317,
 'time': 0.0038885496472256796,
 'time_come': 0.04674551016017217,
 'time_come_town': 0.05215345646722317,
 'top': 0.025617255591598793,
 'top_suitcase': 0.05215345646722317,
 'top_suitcase_put': 0.05215345646722317,
 'touched': 0.038174118057582876,
 'touched_grabbed': 0.05215345646722317,
 'touched_grabbed_wallet': 0.05215345646722317,
 'town': 0.048389559295885134,
 'town_nonetheless': 0.05215345646722317,
 'town_nonetheless_past': 0.05215345646722317,
 'town_stays': 0.05215345646722317,
 'town_stays_us': 0.05215345646722317,
 'treat': 0.02735822544348087,
 'treat_someone': 0.05215345646722317,
 'treat_someone_every': 0.05215345646722317,
 'two': 0.010933809478042688,
 'two_half': 0.04674551016017217,
 'two_half_men': 0.05215345646722317,
 'update': 0.015695964974008842,
 'update_profile': 0.043582064364633875,
 'update_profile_sil': 0.05215345646722317,
 'us': 0.006791509365460977,
 'us_instead': 0.05215345646722317,
 'us_instead_hotel': 0.05215345646722317,
 'visit': 0.019465697611077828,
 'visit_town': 0.05215345646722317,
 'visit_town_stays': 0.05215345646722317,
 'wake': 0.02960272595499357,
 'wake_call': 0.04674551016017217,
 'wallet': 0.26149238618780324,
 'wallet_aita': 0.05215345646722317,
 'wallet_aita_taking': 0.05215345646722317,
 'wallet_bringing': 0.05215345646722317,
 'wallet_bringing_restaurant': 0.05215345646722317,
 'wallet_domes': 0.05215345646722317,
 'wallet_domes_excuses': 0.05215345646722317,
 'wallet_reached': 0.05215345646722317,
 'wallet_reached_purse': 0.05215345646722317,
 'wallet_sitting': 0.05215345646722317,
 'wallet_sitting_right': 0.05215345646722317,
 'wallet_was': 0.05215345646722317,
 'wallet_was_extremely': 0.05215345646722317,
 'wants': 0.014932492405676552,
 'wants_go': 0.039596594001239084,
 'wants_go_expensive': 0.05215345646722317,
 'was': 0.00017755040764926922,
 'was_extremely': 0.033445036020718126,
 'was_extremely_furious': 0.05215345646722317,
 'went': 0.014368366730440266,
 'went_back': 0.025617255591598793,
 'went_back_inside': 0.043582064364633875,
 'went_car': 0.039596594001239084,
 'went_car_pretended': 0.05215345646722317,
 'went_restaurant': 0.041337563853121165,
 'went_restaurant_done': 0.05215345646722317,
 'why': 0.008107582000707988,
 'why_cant': 0.035929617546070165,
 'why_cant_pay': 0.05215345646722317,
 'wow': 0.027039731535255008,
 'wow_thanks': 0.05215345646722317,
 'wow_thanks_awards': 0.05215345646722317,
 'yelled': 0.016624030092291506,
 'yelled_badmouthing': 0.05215345646722317,
 'yelled_badmouthing_internet': 0.05215345646722317}
In [234]:
d_tfidf = {}

for i in range(0,len(corpus)): # for each text
    data = TFIDF(dictionary, corpus, i, tfidf_model) # calculate TFIDF values for text's tokens

    for token, value in data.items(): # next, for each token and its TFIDF value in text, prepare a dictionary
                                      # with tokens' names as keys and list of TF-IDFs as values
        d_tfidf.setdefault(token, []).append(value)
In [235]:
tfidf_values = [item for sublist in list(d_tfidf.values()) for item in sublist]

plt.hist(tfidf_values, bins=1000)
plt.xlabel('TF-IDF')
plt.ylabel('Number of tokens with certain TF-IDF value')
plt.xlim([0, 0.1])
plt.show()
In [236]:
for i in [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5]:
    print('Quantile ',i*100,'%: ',np.quantile(tfidf_values,i),sep='')
Quantile 1.0%: 0.003735010551285883
Quantile 2.0%: 0.0052788350576095724
Quantile 3.0%: 0.006499689239911252
Quantile 4.0%: 0.0076187378989144745
Quantile 5.0%: 0.008691952582100246
Quantile 10.0%: 0.013784480315353988
Quantile 20.0%: 0.023034908582879814
Quantile 30.0%: 0.02915079373277821
Quantile 40.0%: 0.03479737109459152
Quantile 50.0%: 0.038053893410117355
In [237]:
import pickle # for saving objects

import pandas as pd
import matplotlib.pyplot as plt

import plotly.express as px # for nice plotting

import warnings

import math

from nltk.tokenize import RegexpTokenizer # for LSA in sklearn, we will need additional tokenizer

from sklearn.feature_extraction.text import CountVectorizer # one can consider LSA with DF in DTM...
from sklearn.feature_extraction.text import TfidfVectorizer # or with TF-IDF values in DTM

from sklearn.decomposition import LatentDirichletAllocation # LDA implementation

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
In [238]:
# as our preprocessed data is already tokenized
# therefore, we need to make them strings again...

def listToString(s):  
    str1 = ""   
    for ele in s:  
        str1 += ele+" "    
    return str1  

top_posts["body_clean_str"] = top_posts["body_clean"] # new column, for now a copy of tokenized and preprocessed texts
for i in range(0,len(top_posts)):
    top_posts["body_clean_str"][i] = listToString(top_posts["body_clean_str"][i])
    
top_posts.head()
Out[238]:
title body score id top_comment_body top_comment_score url body_clean body_clean_str
0 AITA for bringing my SIL’s wallet to the resta... Edit: update on profile\n\nMy (f28) SIL “Amy” ... 68512 x2k5kv NTA. Stone cold busted. Next time she books an... 1442 https://www.reddit.com/r/AmItheAsshole/comment... [edit, update, profile, sil, amy, always, come... edit update profile sil amy always comes visit...
1 AITA for bringing up my brother's "premature" ... I am a nurse practitioner and I am the primary... 56259 zvmflw You can tell the family about the time you wer... 678 https://www.reddit.com/r/AmItheAsshole/comment... [nurse, practitioner, primary, care, provider,... nurse practitioner primary care provider lot l...
2 AITA for not taking down my video that was a g... I have a sister that’s 6 years older than me. ... 54743 wyjbjs NTA\n\nMy parents missed my wedding too all be... 1578 https://www.reddit.com/r/AmItheAsshole/comment... [sister, thats, years, older, parents, years, ... sister thats years older parents years cancel ...
3 UPDATE AITA for walking out of the Airport whe... Hello!.\n\n\nI don't know where to begin...it'... 51464 ur2l3s I'm sorry you are going through this, but I'm ... 18671 https://www.reddit.com/r/AmItheAsshole/comment... [hello, know, beginits, absolute, nightmare, r... hello know beginits absolute nightmare recentl...
4 AITA for walking out of the Airport when I saw... \n\nI F30 don't have the best relationship wit... 50024 unhse2 Definitely NTA. You know that if you had sucke... 9416 https://www.reddit.com/r/AmItheAsshole/comment... [best, relationship, husbands, mom, since, day... best relationship husbands mom since day one t...
In [239]:
warnings.filterwarnings("ignore") #ignoring popping up warnings

tokenizer = RegexpTokenizer(r'\w+') # tokenizer

tf_vectorizer = CountVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
                                max_df = 0.75, #filtering with document frequency
                                min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
                                tokenizer = tokenizer.tokenize
)
tf = tf_vectorizer.fit_transform(top_posts["body_clean_str"])
tf_feature_names = tf_vectorizer.get_feature_names()

tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 3), #let us use unigrams for now, to make the calculations quicker
                                   max_df = 0.75, #filtering with document frequency
                                   min_df = 5/len(top_posts["body_clean_str"]), #filtering with document frequency
                                   tokenizer = tokenizer.tokenize
)
tfidf = tfidf_vectorizer.fit_transform(top_posts["body_clean_str"])
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
In [240]:
def get_umass_score(dt_matrix, i, j):
    zo_matrix = (dt_matrix > 0).astype(int)
    col_i, col_j = zo_matrix[:, i], zo_matrix[:, j]
    col_ij = col_i + col_j
    col_ij = (col_ij == 2).astype(int)    
    Di, Dij = col_i.sum(), col_ij.sum()    
    return math.log((Dij + 1) / Di)

def get_topic_coherence(dt_matrix, topic, n_top_words):
    indexed_topic = zip(topic, range(0, len(topic)))
    topic_top = sorted(indexed_topic, key=lambda x: 1 - x[0])[0:n_top_words]
    coherence = 0
    for j_index in range(0, len(topic_top)):
        for i_index in range(0, j_index - 1):
            i = topic_top[i_index][1]
            j = topic_top[j_index][1]
            coherence += get_umass_score(dt_matrix, i, j)
    return coherence

def get_average_topic_coherence(dt_matrix, topics, n_top_words):
    total_coherence = 0
    for i in range(0, len(topics)):
        total_coherence += get_topic_coherence(dt_matrix, topics[i], n_top_words)
    return total_coherence / len(topics)
In [241]:
measures_specific = []

for n_topics in range(2,51,1):
    
    print('Trying parameters:', n_topics)
    
    lda = LatentDirichletAllocation(n_components = n_topics, 
                                    learning_method = 'online',
                                    learning_offset = 50.0,
                                    max_iter = 5, 
                                    random_state = 42)
                
    lda.fit(tf)
    
    avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)

    measures_specific.append([avg_coherence, n_topics])
Trying parameters: 2
Trying parameters: 3
Trying parameters: 4
Trying parameters: 5
Trying parameters: 6
Trying parameters: 7
Trying parameters: 8
Trying parameters: 9
Trying parameters: 10
Trying parameters: 11
Trying parameters: 12
Trying parameters: 13
Trying parameters: 14
Trying parameters: 15
Trying parameters: 16
Trying parameters: 17
Trying parameters: 18
Trying parameters: 19
Trying parameters: 20
Trying parameters: 21
Trying parameters: 22
Trying parameters: 23
Trying parameters: 24
Trying parameters: 25
Trying parameters: 26
Trying parameters: 27
Trying parameters: 28
Trying parameters: 29
Trying parameters: 30
Trying parameters: 31
Trying parameters: 32
Trying parameters: 33
Trying parameters: 34
Trying parameters: 35
Trying parameters: 36
Trying parameters: 37
Trying parameters: 38
Trying parameters: 39
Trying parameters: 40
Trying parameters: 41
Trying parameters: 42
Trying parameters: 43
Trying parameters: 44
Trying parameters: 45
Trying parameters: 46
Trying parameters: 47
Trying parameters: 48
Trying parameters: 49
Trying parameters: 50
In [242]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_df_lda = pd.DataFrame(measures_specific).rename(columns={
    0: 'avg_coherence', 1: 'n_topics'
})

save_object(measures_specific_df_lda, 'TM_project/measures_specific_df_lda.pkl')
In [243]:
with open("TM_project/measures_specific_df_lda.pkl", "rb") as fp:
    measures_specific_df_lda = pickle.load(fp)
In [244]:
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_df_lda['n_topics'],measures_specific_df_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
In [245]:
measures_specific_df_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
Out[245]:
avg_coherence n_topics
0 -197.258110 2
6 -218.464319 8
9 -223.506711 11
10 -225.799363 12
1 -227.682847 3
11 -230.803568 13
12 -232.100892 14
7 -237.269628 9
8 -239.096575 10
In [246]:
warnings.filterwarnings("ignore") #ignoring popping up warnings

measures_specific = []

for n_topics in range(2,51,1):
    
    print('Trying parameters:', n_topics)
    
    lda = LatentDirichletAllocation(n_components = n_topics, 
                                    learning_method = 'online',
                                    learning_offset = 50.0,
                                    max_iter = 5, 
                                    random_state = 42)
                
    lda.fit(tfidf)
    
    avg_coherence = get_average_topic_coherence(tfidf, lda.components_, 25)

    measures_specific.append([avg_coherence, n_topics])
Trying parameters: 2
Trying parameters: 3
Trying parameters: 4
Trying parameters: 5
Trying parameters: 6
Trying parameters: 7
Trying parameters: 8
Trying parameters: 9
Trying parameters: 10
Trying parameters: 11
Trying parameters: 12
Trying parameters: 13
Trying parameters: 14
Trying parameters: 15
Trying parameters: 16
Trying parameters: 17
Trying parameters: 18
Trying parameters: 19
Trying parameters: 20
Trying parameters: 21
Trying parameters: 22
Trying parameters: 23
Trying parameters: 24
Trying parameters: 25
Trying parameters: 26
Trying parameters: 27
Trying parameters: 28
Trying parameters: 29
Trying parameters: 30
Trying parameters: 31
Trying parameters: 32
Trying parameters: 33
Trying parameters: 34
Trying parameters: 35
Trying parameters: 36
Trying parameters: 37
Trying parameters: 38
Trying parameters: 39
Trying parameters: 40
Trying parameters: 41
Trying parameters: 42
Trying parameters: 43
Trying parameters: 44
Trying parameters: 45
Trying parameters: 46
Trying parameters: 47
Trying parameters: 48
Trying parameters: 49
Trying parameters: 50
In [247]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
measures_specific_tfidf_lda = pd.DataFrame(measures_specific).rename(columns={
    0: 'avg_coherence', 1: 'n_topics'
})

save_object(measures_specific_tfidf_lda, 'TM_project/measures_specific_tfidf_lda.pkl')
In [248]:
with open("TM_project/measures_specific_tfidf_lda.pkl", "rb") as fp:
    measures_specific_tfidf_lda = pickle.load(fp)
In [249]:
plt.style.use("fivethirtyeight")
plt.plot(measures_specific_tfidf_lda['n_topics'],measures_specific_tfidf_lda['avg_coherence'])
plt.xlabel("No. of topics")
plt.ylabel("Average topic coherence")
plt.show()
In [250]:
measures_specific_tfidf_lda.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
Out[250]:
avg_coherence n_topics
0 -235.322070 2
3 -335.975590 5
1 -415.732856 3
4 -419.810625 6
2 -481.555180 4
5 -487.373452 7
7 -501.306549 9
9 -504.806318 11
12 -513.255496 14
In [132]:
# here we consider the previously presented LatentDirichletAllocation() function, still with less parameters
lda = LatentDirichletAllocation(n_components = 11, 
                                learning_method = 'online', 
                                learning_offset = 80.0,
                                max_iter = 5, 
                                random_state = 42)
lda.fit(tfidf)
Out[132]:
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
                          max_iter=5, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LatentDirichletAllocation(learning_method='online', learning_offset=80.0,
                          max_iter=5, random_state=42)
In [ ]:
for index, component in enumerate(lda.components_): #taking model's components 
                                                    #(values from reconstructed Document-Term Matrix)
    zipped = zip(tf_feature_names, component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
    
    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0:  ['his', 'his mom', 'college', 'school', 'princess', 'asked why said', 'family', 'girls', 'mom', 'going']
Topic 1:  ['listen', 'anymore', 'hes', 'big', 'his', 'asking was', 'begging', 'wedding', 'special', 'celebratory dinner']
Topic 2:  ['adam', 'work', 'shouting', 'think its', 'this morning', 'dog', 'very much', 'drink', 'fair', 'would like']
Topic 3:  ['his', 'family', 'house', 'hold', 'started', 'like', 'get', 'husband', 'mom', 'friend']
Topic 4:  ['approached', 'full time', 'year old daughter', 'welcome', 'dead', 'couple days', 'cancelled', 'comment', 'bother', 'son his']
Topic 5:  ['his', 'husband', 'im', 'mom', 'sister', 'family', 'like', 'wife', 'get', 'parents']
Topic 6:  ['his', 'brother', 'blue', 'years', 'new', 'im', 'dead', 'job', 'husband', 'fund']
Topic 7:  ['im', 'sister', 'carrying', 'parents', 'went', 'ive', 'was serious', 'income', 'saturday', 'still']
Topic 8:  ['son', 'husband', 'tradition', 'family dinner', 'said was', 'dress', 'always', 'his', 'give', 'heart']
Topic 9:  ['want', 'husband', 'daughter', 'made', 'its', 'gone', 'cake', 'wife', 'thats', 'any']
In [251]:
params = []
for alpha in [0.0001, 0.001, 0.01, 0.05, 0.1]:
    for beta in [0.0001, 0.001, 0.01, 0.05, 0.1]:
        for vectorizer_name in ['tf','tf-idf']:
            
            if(vectorizer_name == 'tf'):
                print(alpha, beta, 'tf')
                lda = LatentDirichletAllocation(n_components = 11, 
                                                doc_topic_prior = alpha,
                                                topic_word_prior = beta,
                                                learning_method = 'online', 
                                                learning_offset = 10.0,
                                                max_iter = 5, 
                                                random_state = 42)
                lda.fit(tf)
                avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
                params.append([alpha, beta, 'tf', avg_coherence])
                    
            if(vectorizer_name == 'tf-idf'):
                print(alpha, beta, 'tf-idf')
                lda = LatentDirichletAllocation(n_components = 11, 
                                                doc_topic_prior = alpha,
                                                topic_word_prior = beta,
                                                learning_method = 'online', 
                                                learning_offset = 10.0,
                                                max_iter = 5, 
                                                random_state = 42)
                lda.fit(tfidf)
                avg_coherence = get_average_topic_coherence(tf, lda.components_, 25)
                params.append([alpha, beta, 'tf-idf', avg_coherence])
0.0001 0.0001 tf
0.0001 0.0001 tf-idf
0.0001 0.001 tf
0.0001 0.001 tf-idf
0.0001 0.01 tf
0.0001 0.01 tf-idf
0.0001 0.05 tf
0.0001 0.05 tf-idf
0.0001 0.1 tf
0.0001 0.1 tf-idf
0.001 0.0001 tf
0.001 0.0001 tf-idf
0.001 0.001 tf
0.001 0.001 tf-idf
0.001 0.01 tf
0.001 0.01 tf-idf
0.001 0.05 tf
0.001 0.05 tf-idf
0.001 0.1 tf
0.001 0.1 tf-idf
0.01 0.0001 tf
0.01 0.0001 tf-idf
0.01 0.001 tf
0.01 0.001 tf-idf
0.01 0.01 tf
0.01 0.01 tf-idf
0.01 0.05 tf
0.01 0.05 tf-idf
0.01 0.1 tf
0.01 0.1 tf-idf
0.05 0.0001 tf
0.05 0.0001 tf-idf
0.05 0.001 tf
0.05 0.001 tf-idf
0.05 0.01 tf
0.05 0.01 tf-idf
0.05 0.05 tf
0.05 0.05 tf-idf
0.05 0.1 tf
0.05 0.1 tf-idf
0.1 0.0001 tf
0.1 0.0001 tf-idf
0.1 0.001 tf
0.1 0.001 tf-idf
0.1 0.01 tf
0.1 0.01 tf-idf
0.1 0.05 tf
0.1 0.05 tf-idf
0.1 0.1 tf
0.1 0.1 tf-idf
In [252]:
# below, we make the output (list) a pandas DataFrame with intuitive colnames
params_df = pd.DataFrame(params).rename(columns={
    0: 'alpha', 1: 'beta', 2: 'vectorizer', 3: 'avg_coherence'
})

save_object(params_df, 'TM_project/params_df.pkl')
In [253]:
with open("TM_project/params_df.pkl", "rb") as fp:
    params_df = pickle.load(fp)
In [254]:
params_df.sort_values('avg_coherence', ascending = False).iloc[0:9,:]
Out[254]:
alpha beta vectorizer avg_coherence
38 0.0500 0.10 tf -355.996279
46 0.1000 0.05 tf -356.302434
36 0.0500 0.05 tf -358.401506
28 0.0100 0.10 tf -358.964835
18 0.0010 0.10 tf -359.791277
8 0.0001 0.10 tf -359.829972
4 0.0001 0.01 tf -366.320263
14 0.0010 0.01 tf -366.845765
44 0.1000 0.01 tf -369.176138
In [255]:
fig = px.scatter(params_df[params_df['vectorizer']=='tf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
In [256]:
fig = px.scatter(params_df[params_df['vectorizer']=='tf-idf'], x="alpha", y="beta", color="avg_coherence")
fig.show()
In [260]:
lda = LatentDirichletAllocation(n_components = 11, # let us stay with 11, as that is what topic coherence initially recommended 
                                doc_topic_prior = 0.0500,
                                topic_word_prior = 0.10,
                                learning_method = 'online', 
                                learning_offset = 10.0,
                                max_iter = 20, 
                                random_state = 42)
lda.fit(tf) # TF for now

topics_lists = []

for index, component in enumerate(lda.components_): #taking model's components 
                                                    #(values from reconstructed Document-Term Matrix)
    zipped = zip(tf_feature_names, component) #taking together tokens' names with components
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:10] #top 10 terms per topic
    top_terms_list=list(dict(top_terms_key).keys()) #taking only tokens, no weights
    
    topics_lists.append(top_terms_list)
    print("Topic "+str(index)+": ",top_terms_list) #prints top 10 tokens per topic
Topic 0:  ['his', 'im', 'because', 'like', 'husband', 'got', 'get', 'time', 'its', 'has']
Topic 1:  ['gold', 'digger', 'gold digger', 'bf', 'hold', 'high', 'route', 'suppose', 'correctly', 'seconds']
Topic 2:  ['im', 'his', 'because', 'like', 'get', 'its', 'one', 'got', 'even', 'before']
Topic 3:  ['dress', 'wedding', 'wear', 'makeup', 'wearing', 'bride', 'wedding dress', 'dresses', 'color', 'bridesmaids']
Topic 4:  ['his', 'daughter', 'daughter asked', 'friend', 'got', 'because', 'since', 'could', 'asked', 'land']
Topic 5:  ['seat', 'seats', 'tattoo', 'flight', 'plane', 'his seat', 'attendant', 'exactly', 'next', 'brother law']
Topic 6:  ['his', 'his mom', 'mom', 'got', 'husband', 'home', 'us', 'get', 'like', 'one']
Topic 7:  ['amy', 'ava', 'lisa', 'watch', 'italian', 'bill', 'sex', 'grandfather', 'profile', 'family']
Topic 8:  ['his', 'family', 'because', 'mom', 'would', 'wife', 'got', 'has', 'dad', 'like']
Topic 9:  ['hair', 'jake', 'police', 'black', 'back', 'went', 'asked', 'name', 'office', 'coworkers']
Topic 10:  ['his', 'snooping', 'husband', 'his mom', 'asked', 'got', 'im', 'bedroom', 'since', 'mom']
In [264]:
import os
import openai
from IPython.display import Image
from IPython import display
from base64 import b64decode
In [265]:
openai.api_key = "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"
In [278]:
images = []

for i in range(len(topics_lists)):
    
    try:
        topic_prompt = " ".join(topics_lists[i])

        response = openai.Image.create(
            prompt=topic_prompt,
            n=1,
            size="512x512",
            response_format="b64_json"
        )

        images.append((i,response['data'][0]['b64_json']))

        print(i)
    except:
        images.append((i, np.nan))
        print(i)
        print("too NSFW for OpenAI")
0
1
2
3
4
5
6
7
too NSFW for OpenAI
8
9
10
In [279]:
image_df = pd.DataFrame(images, columns =['topic', 'image'])
image_df["words"] = topics_lists

image_df.head()
Out[279]:
topic image words
0 0 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [his, im, because, like, husband, got, get, ti...
1 1 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [gold, digger, gold digger, bf, hold, high, ro...
2 2 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [im, his, because, like, get, its, one, got, e...
3 3 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [dress, wedding, wear, makeup, wearing, bride,...
4 4 iVBORw0KGgoAAAANSUhEUgAAAgAAAAIACAIAAAB7GkOtAA... [his, daughter, daughter asked, friend, got, b...
In [280]:
len(image_df)
Out[280]:
11
In [281]:
for i in range(len(image_df)):
    print("Image for topic ", i, " with words:")
    print(image_df.loc[i,"words"])
    try:
        display.display(display.Image(b64decode(image_df.loc[i,"image"])))
    except:
        print("Too NSFW for OpenAI")
Image for topic  0  with words:
['his', 'im', 'because', 'like', 'husband', 'got', 'get', 'time', 'its', 'has']
Image for topic  1  with words:
['gold', 'digger', 'gold digger', 'bf', 'hold', 'high', 'route', 'suppose', 'correctly', 'seconds']
Image for topic  2  with words:
['im', 'his', 'because', 'like', 'get', 'its', 'one', 'got', 'even', 'before']
Image for topic  3  with words:
['dress', 'wedding', 'wear', 'makeup', 'wearing', 'bride', 'wedding dress', 'dresses', 'color', 'bridesmaids']
Image for topic  4  with words:
['his', 'daughter', 'daughter asked', 'friend', 'got', 'because', 'since', 'could', 'asked', 'land']
Image for topic  5  with words:
['seat', 'seats', 'tattoo', 'flight', 'plane', 'his seat', 'attendant', 'exactly', 'next', 'brother law']
Image for topic  6  with words:
['his', 'his mom', 'mom', 'got', 'husband', 'home', 'us', 'get', 'like', 'one']
Image for topic  7  with words:
['amy', 'ava', 'lisa', 'watch', 'italian', 'bill', 'sex', 'grandfather', 'profile', 'family']
Too NSFW for OpenAI
Image for topic  8  with words:
['his', 'family', 'because', 'mom', 'would', 'wife', 'got', 'has', 'dad', 'like']
Image for topic  9  with words:
['hair', 'jake', 'police', 'black', 'back', 'went', 'asked', 'name', 'office', 'coworkers']
Image for topic  10  with words:
['his', 'snooping', 'husband', 'his mom', 'asked', 'got', 'im', 'bedroom', 'since', 'mom']
In [282]:
df_topics_for_posts = pd.DataFrame(lda.transform(tf).tolist())

df_topics_for_posts.head()
Out[282]:
0 1 2 3 4 5 6 7 8 9 10
0 0.902614 0.000288 0.000288 0.000288 0.000288 0.000288 0.000288 0.052027 0.032663 0.010679 0.000288
1 0.000188 0.000188 0.000188 0.000188 0.000188 0.000188 0.000188 0.290750 0.707562 0.000188 0.000188
2 0.105910 0.000203 0.000203 0.000203 0.000203 0.000203 0.000203 0.000203 0.800131 0.092337 0.000203
3 0.821660 0.000164 0.000164 0.000164 0.000164 0.000164 0.000164 0.000164 0.176868 0.000164 0.000164
4 0.942130 0.000163 0.000163 0.000163 0.000163 0.000163 0.000163 0.000163 0.056407 0.000163 0.000163
In [283]:
top_posts_final = pd.merge(top_posts, round(df_topics_for_posts*100, 3), left_index=True, right_index=True)

top_posts_final.head()
Out[283]:
title body score id top_comment_body top_comment_score url body_clean body_clean_str 0 1 2 3 4 5 6 7 8 9 10
0 AITA for bringing my SIL’s wallet to the resta... Edit: update on profile\n\nMy (f28) SIL “Amy” ... 68512 x2k5kv NTA. Stone cold busted. Next time she books an... 1442 https://www.reddit.com/r/AmItheAsshole/comment... [edit, update, profile, sil, amy, always, come... edit update profile sil amy always comes visit... 90.261 0.029 0.029 0.029 0.029 0.029 0.029 5.203 3.266 1.068 0.029
1 AITA for bringing up my brother's "premature" ... I am a nurse practitioner and I am the primary... 56259 zvmflw You can tell the family about the time you wer... 678 https://www.reddit.com/r/AmItheAsshole/comment... [nurse, practitioner, primary, care, provider,... nurse practitioner primary care provider lot l... 0.019 0.019 0.019 0.019 0.019 0.019 0.019 29.075 70.756 0.019 0.019
2 AITA for not taking down my video that was a g... I have a sister that’s 6 years older than me. ... 54743 wyjbjs NTA\n\nMy parents missed my wedding too all be... 1578 https://www.reddit.com/r/AmItheAsshole/comment... [sister, thats, years, older, parents, years, ... sister thats years older parents years cancel ... 10.591 0.020 0.020 0.020 0.020 0.020 0.020 0.020 80.013 9.234 0.020
3 UPDATE AITA for walking out of the Airport whe... Hello!.\n\n\nI don't know where to begin...it'... 51464 ur2l3s I'm sorry you are going through this, but I'm ... 18671 https://www.reddit.com/r/AmItheAsshole/comment... [hello, know, beginits, absolute, nightmare, r... hello know beginits absolute nightmare recentl... 82.166 0.016 0.016 0.016 0.016 0.016 0.016 0.016 17.687 0.016 0.016
4 AITA for walking out of the Airport when I saw... \n\nI F30 don't have the best relationship wit... 50024 unhse2 Definitely NTA. You know that if you had sucke... 9416 https://www.reddit.com/r/AmItheAsshole/comment... [best, relationship, husbands, mom, since, day... best relationship husbands mom since day one t... 94.213 0.016 0.016 0.016 0.016 0.016 0.016 0.016 5.641 0.016 0.016
In [284]:
save_object(top_posts_final, 'TM_project/final_df.pkl')
In [214]:
os.system('jupyter nbconvert --to html TM_project/Code_for_LDA.ipynb')
[NbConvertApp] Converting notebook TM_project/Code_for_LDA.ipynb to html
[NbConvertApp] Writing 10251414 bytes to TM_project/Code_for_LDA.html
Out[214]:
0